In general, a stable financial system is an important prerequisite for economic growth and stability. Banks play an essential role in the financial system of every country, and as highlighted by the global financial crisis, they can trigger a global economic crisis. For example, Lehman Brothers' bankruptcy filing was the largest in US history and is considered to have played a decisive role in the unfolding of the financial crisis of 2007–2008. That is why supervisors and regulatory authorities try to predict the probability of bank default in order to apply appropriate micro- or macroprudential policies.
The goal of this project is to predict bank default using Machine Learning algorithms.
# Packages
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as py
import scipy.stats as stats
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, confusion_matrix, f1_score,
                             precision_score, recall_score)
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.utils import resample

warnings.filterwarnings('ignore')
#Data
# NOTE(review): hard-coded absolute Windows path — make this relative or
# configurable before sharing the script.
df = pd.read_csv('C:\\Users\\Taguhi\\Desktop\\ML Individual\\data.csv')
df.head()   # first rows (bare expression: displays only in a notebook)
df.shape    # (rows, columns)
df.info()   # dtypes and non-null counts per column
There are no missing values in the dataset.
## Feature Statistics
# Per-feature summary statistics, target column excluded.
feature_df = (
    df.drop('Bankrupt?', axis=1)
      .describe()
      .T
      .reset_index()
      .rename(columns={'index': 'columns'})
)
feat_cols = feature_df['columns']
# Distinct value counts, dispersion and location per feature, plus the
# point-biserial correlation of each feature with the binary target.
feature_df['distinct_vals'] = [len(df[c].value_counts()) for c in feat_cols]
feature_df['column_var'] = [np.var(df[c]) for c in feat_cols]
feature_df['column_std'] = [np.std(df[c]) for c in feat_cols]
feature_df['column_mean'] = [np.mean(df[c]) for c in feat_cols]
feature_df['target_corr'] = [stats.pointbiserialr(df['Bankrupt?'], df[c])[0]
                             for c in feat_cols]
feature_df.head()
# Constant-valued features (zero variance): count them, then show them.
zero_var_mask = feature_df['column_var'].astype(float) == 0.0
len(feature_df[zero_var_mask])
feature_df[zero_var_mask]
# Remove the constant-valued Net Income Flag column.
df = df.drop(columns=[' Net Income Flag'])
The Net Income Flag has constant values, so it is useless and we will drop it.
# Sort by variance, min-max normalise it to [0, 1], then plot.
feature_df = feature_df.sort_values('column_var', ascending=True)
var_col = feature_df['column_var']
feature_df['column_var'] = (var_col - var_col.min()) / (var_col.max() - var_col.min())
fig = go.Figure(
    data=[go.Scatter(x=feature_df['columns'],
                     y=feature_df['column_var'],
                     opacity=0.75,
                     marker=dict(color="red"))],
    layout=dict(height=400, title='Feature Variance', legend=dict(orientation="h")),
)
py.iplot(fig)
One statistical intuition is that if a feature's variance is very low, the feature will contribute less to the model. We see that most of our features have quite low variance.
# Histogram of each feature's correlation with the target.
fig = go.Figure(
    data=[go.Histogram(x=feature_df['target_corr'],
                       opacity=0.45,
                       marker=dict(color="red"))],
    layout=dict(height=400,
                title='Distribution of correlation with target',
                legend=dict(orientation="h")),
)
py.iplot(fig)
Also, many features have almost zero correlation with the target variable.
# Keep features with |point-biserial correlation| > 0.05 AND non-negligible
# variance.
# FIX: the original expression relied on operator precedence — `&` binds
# tighter than `|`, so the variance filter was applied only to the
# negative-correlation branch. Parenthesise so it applies to both.
selected = feature_df[
    ((feature_df.target_corr > 0.05) | (feature_df.target_corr < -0.05))
    & (feature_df.column_var > 0.00001)
]
selected.shape
# Same correlation histogram, restricted to the selected features.
fig = go.Figure(
    data=[go.Histogram(x=selected['target_corr'],
                       opacity=0.45,
                       marker=dict(color="green"))],
    layout=dict(height=400,
                title='Distribution of correlation with target',
                legend=dict(orientation="h")),
)
py.iplot(fig)
# Subset df to the selected feature columns.
# FIX: .copy() makes df_selected an independent frame — without it the
# column assignment below is chained assignment on a view of df
# (pandas SettingWithCopyWarning, assignment may silently not stick).
df_selected = df[df.columns.intersection(selected['columns'])].copy()
df_selected['default'] = df['Bankrupt?']
df_selected
# Correlation heatmap of the selected features.
fig, ax = plt.subplots(figsize=(15, 8))
sns.heatmap(df_selected.corr(), annot=True, cmap="RdBu", ax=ax)
plt.show()
There are highly correlated columns such as Liability to Equity and Borrowing dependency, Equity to Long-term Liability and Borrowing dependency, Current Liability to Equity and Borrowing dependency, Borrowing dependency and Inventory and accounts receivable/Net value, Debt ratio % and Current Liability to Assets, and so on. We will drop one column from each of these correlated pairs.
# Drop one column from each highly correlated pair found above,
# then re-check the correlation structure.
redundant = [
    ' Borrowing dependency',
    ' Liability to Equity',
    ' Current Liability to Assets',
    ' Current Liabilities/Equity',
]
df_selected.drop(redundant, axis=1, inplace=True)
fig, ax = plt.subplots(figsize=(15, 8))
sns.heatmap(df_selected.corr(), annot=True, cmap="RdBu", ax=ax)
plt.show()
# Description of the variables
# Recompute summary statistics on the reduced feature set.
feature_df = df_selected.drop('default', axis=1).describe().T
feature_df = feature_df.reset_index().rename(columns = {'index' : 'columns'})
feature_df['distinct_vals'] = feature_df['columns'].apply(lambda x : len(df_selected[x].value_counts()))
# CONSISTENCY FIX: use df_selected throughout (the original mixed df and
# df_selected here; the values are identical since df_selected['default']
# is a copy of df['Bankrupt?'] and rows are unchanged).
feature_df['target_corr'] = feature_df['columns'].apply(lambda x : stats.pointbiserialr(df_selected['default'], df_selected[x])[0])
feature_df
df_selected.info()
# Changing types of target and categorical variables
# Treat the target and the binary flag as categorical (object dtype).
for cat_col in ('default', ' Liability-Assets Flag'):
    df_selected[cat_col] = df_selected[cat_col].astype("O")
# Box plot of every numeric feature split by default status.
numeric_cols = df_selected.drop(["default", ' Liability-Assets Flag'], axis=1).columns
for column in numeric_cols:
    px.box(df_selected, x='default', y=column).show()
# columns = df_selected.columns[:-1]
# for name in columns:
# Q1=df_selected[name].quantile(0.25)
# Q3=df_selected[name].quantile(0.75)
# IQR=Q3-Q1
# Lower_Whisker = Q1 - 1.5 * IQR
# Upper_Whisker = Q3 + 1.5 * IQR
# df_selected = df_selected[df_selected[name]< Upper_Whisker]
# df_selected = df_selected[df_selected[name]> Lower_Whisker]
# Pairwise scatter plots of the first four features, coloured by default.
first_four = df_selected.columns[:4]
sns.pairplot(df_selected, vars=first_four, hue='default')
df_selected.columns
# Row-normalised crosstab: share of each flag value within each default class.
cross = pd.crosstab(df_selected['default'], df_selected[' Liability-Assets Flag'])
cross = cross.apply(lambda r: r/r.sum(), axis=1)
plt.figure(figsize=(5, 3))
s = sns.heatmap(cross, annot=True, cmap='RdBu', vmin=0, vmax=1)
If there is no flag, the bank is less likely to default, while when the flag is present, the bank is more likely to default.
# Class balance of the target: plot counts, then show percentages.
fig, ax = plt.subplots(figsize=(10, 8))
sns.countplot(df_selected.default, palette="rocket", ax=ax)
ax.set_title("Target variable distribution")
plt.show()
# Target variable distribution by percentages
100 * df_selected.default.value_counts(normalize=True)
# Build feature matrix / target and split into train and test sets.
target = df_selected['default'].astype(int)
X = df_selected.drop(columns='default')
X = pd.get_dummies(X, columns=[' Liability-Assets Flag'])
train_x,test_x,train_y,test_y = train_test_split(X,target,test_size=0.25,random_state=42)
# FIX: fit the scaler on the training split only and reuse it for the test
# split. The original fitted a second StandardScaler on the test set, which
# leaks test-set statistics and scales the two splits inconsistently.
scaler = StandardScaler().fit(train_x)
train_x = scaler.transform(train_x)
test_x = scaler.transform(test_x)
print(f"Number of observations \n Train set: {len(train_x)}\n Test set: {len(test_x)}")
# Baseline classifiers (fixed seeds for reproducibility).
Logistic = LogisticRegression(random_state=42)
Knn = KNeighborsClassifier(n_neighbors=5)
RFC = RandomForestClassifier(n_estimators=100, random_state=42)
# FIX: the mid-file `from sklearn.svm import SVC` has been moved into the
# top-of-file import block (PEP 8: imports at the top of the module).
svc = SVC(C=100.0)
def results(cls_list, names=('Logistic', 'Knn', 'RFC')):
    """Fit each classifier and collect train/test metrics in a DataFrame.

    FIX: the original hard-coded the column labels inside the function and
    indexed them positionally, which silently breaks for any classifier list
    other than [Logistic, Knn, RFC]; the labels are now a backward-compatible
    parameter paired with `cls_list`. Also renamed the local result frame so
    it no longer shadows the function's own name.

    Parameters:
        cls_list: sklearn-style classifiers (fitted in place, one per name).
        names: column labels for the result frame; the default matches the
            original hard-coded labels.

    Returns:
        pd.DataFrame with rows Acc_train / Acc_test / Sensitivity /
        Specificity and one column per classifier.

    NOTE(review): reads train_x, train_y, test_x, test_y from module scope —
    the metrics reflect whatever split was prepared most recently.
    """
    indx = ['Acc_train', 'Acc_test', 'Sensitivity', 'Specificity']
    res = pd.DataFrame(index=indx, columns=list(names))
    for name, cls in zip(names, cls_list):
        cls.fit(train_x, train_y)
        test_y_new = cls.predict(test_x)
        train_y_new = cls.predict(train_x)
        acc_train = accuracy_score(train_y, train_y_new)
        acc_test = accuracy_score(test_y, test_y_new)
        sensitivity = recall_score(test_y, test_y_new)           # recall on class 1
        specificity = recall_score(test_y, test_y_new, pos_label=0)  # recall on class 0
        res[name] = [acc_train, acc_test, sensitivity, specificity]
    return res
# Evaluate the three baseline models and visualise the metrics.
classifiers = [Logistic, Knn, RFC]
res = results(classifiers)
plt.figure(figsize=(10, 5))
s = sns.heatmap(res, annot=True, cmap='RdBu', vmin=0, vmax=1)
There is obvious overfitting in the models.
svc = SVC(C=100.0)
# fit classifier to training set
svc.fit(train_x, train_y)
# make predictions on test set
y_pred = svc.predict(test_x)
# compute and print accuracy score
print('Training set accuracy: {:.4f}'.format(svc.score(train_x, train_y)))
print('Test set accuracy: {:.4f}'.format(svc.score(test_x, test_y)))
cm = confusion_matrix(test_y, y_pred)
# FIX: sklearn's confusion_matrix has rows = true labels and columns =
# predicted labels, in sorted label order [0, 1]. So cm[0,0] is the count of
# true-0 predicted-0 (TN), not TP as the original labelled it — all four
# cells were mislabeled, which made the printed recall/specificity wrong.
TN = cm[0, 0]
FP = cm[0, 1]
FN = cm[1, 0]
TP = cm[1, 1]
recall = TP / float(TP + FN)
specificity = TN / (TN + FP)
print('Recall or Sensitivity : {0:0.4f}'.format(recall))
print('Specificity : {0:0.4f}'.format(specificity))
# Random undersampling: shrink the majority (non-default) class to the
# minority class size, then re-split, re-scale and re-evaluate.
# NOTE(review): resampling happens BEFORE the train/test split here, so the
# test set no longer reflects the original class balance — confirm intended.
majority = df_selected[df_selected.default==0]
minority = df_selected[df_selected.default==1]
downsampled = resample(majority,
                       replace=False,
                       # generalized from the hard-coded 220, which equals the
                       # minority class size in this dataset
                       n_samples=len(minority),
                       random_state=42)
downsampled = pd.concat([downsampled, minority])
target = downsampled['default'].astype(int)
X = downsampled.drop(columns='default')
train_x,test_x,train_y,test_y = train_test_split(X,target,test_size=0.25,random_state=42)
# FIX: fit the scaler on the training split only (the original fitted a
# second scaler on the test set, leaking test-set statistics).
scaler = StandardScaler().fit(train_x)
train_x = scaler.transform(train_x)
test_x = scaler.transform(test_x)
res = results(classifiers)
plt.figure(figsize=(10, 5))
s = sns.heatmap(res, annot=True, cmap='Blues', vmin=0, vmax=1)
# Random oversampling: rebuild the original split, then oversample the
# minority class in the TRAINING set only (the test set keeps its natural
# class balance — the correct ordering, unlike the downsampling cell above).
target = df_selected['default'].astype(int)
X = df_selected.drop(columns='default')
train_x,test_x,train_y,test_y = train_test_split(X,target,test_size=0.25,random_state=42)
train = pd.concat([train_x, train_y], axis = 1)
majority = train[train.default==0]
minority = train[train.default==1]
# NOTE(review): 2141 looks hand-picked; for a fully balanced training set
# this would be len(majority) — confirm the intent before changing it.
upsampled2 = resample(minority,
                      replace=True,
                      n_samples=2141,
                      random_state=42)
upsampled2 = pd.concat([majority, upsampled2])
train_y = upsampled2['default'].astype(int)
train_x = upsampled2.drop(columns='default')
# FIX: fit the scaler on the training split only (the original fitted a
# second scaler on the test set, leaking test-set statistics).
scaler = StandardScaler().fit(train_x)
train_x = scaler.transform(train_x)
test_x = scaler.transform(test_x)
res = results(classifiers)
plt.figure(figsize=(10, 5))
s = sns.heatmap(res, annot=True, cmap='Blues', vmin=0, vmax=1)
The best result was from logistic regression with downsampling.